rm(list=ls(all=TRUE))
pacman::p_load(vcd, magrittr, readr, caTools, ggplot2, dplyr, plotly, ggrepel)
load("data/tf0.rdata")
sapply(list(cust=A0,tid=X0,items=Z0), nrow)## cust tid items
## 32241 119328 817182
Z0$date = as.Date(Z0$date, format="%m/%d/%Y")
hist(Z0$date,'weeks',freq=T,las=2)
主要發現:
● 主要發現一月銷售量相較於其他月份銷售量更突出,推估為過年買氣較旺,可根據此檔期做行銷策略專案。
● 從圖中可發現12月的第四周的交易量較少,可以推測該週可能是因為店內整修,有部分天數店休,導致交易量驟降。
ggplot(Z0) +geom_bar(aes(x = age, fill = age))options(scipen=999)#不要科學記號
ggplot(Z0) +geom_bar(aes(x = area, fill = area)) 我們得知Ta Feng量販店的會員集中在汐止區與南港區,因此推估商店舖座落在兩區之間。
假設大豐超市位於南港區global mall商城內的國際商品生鮮超市,
引進進口產品,提高高質量零食與生鮮蔬果商品。
超市會員主要多為30-50歲青、壯年年齡居多,推估此年齡層有一定的基本經濟能力。
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~area+age, A0)主要發現:
● 主要顧客來源為南港區及汐止區,信義區及內湖區為其次
● 顧客中30~44歲的年齡層較多
● 南港區較多25歲以下學生族群,較少30~40歲壯年客群
● 汐止區30~40歲的顧客比率比較高
X0$wday = format(X0$date, "%u")
ht <- count(X0, age, wday)
X0A <- merge(X0,ht)
ggplot(X0A, aes(X0A$wday,X0A$age)) +
geom_tile(aes(fill = n),colour = "white")## Warning: Use of `X0A$wday` is discouraged. Use `wday` instead.
## Warning: Use of `X0A$age` is discouraged. Use `age` instead.
scale_fill_gradient(low = "white",high = "steelblue", limits = c(0, 6000))## <ScaleContinuous>
## Range:
## Limits: 0 -- 6e+03
假日的各年齡層購買力較平日強,尤其30~39歲族群最顯著
A0 %>% group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")mean(A0$age == "a99")## [1] 0.01941627
由於a99(沒有年齡資料的顧客)人數不多,而且特徵很獨特,探索時我們可以考慮濾掉這群顧客
A0 %>% filter(age!="a99") %>% # 濾掉沒有年齡資料的顧客('a99')
group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")A0 %>% filter(age!="a99") %>% # 濾掉沒有年齡資料的顧客('a99')
group_by(area) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
geom_text(aes(label=area)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("地理區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")情境推測:
1. 假設Ta Feng為位於南港區的購物中心內的量販店
2. 住較近學生及上班族群下班下課後的聚會地點
3. 因地緣關係,信義區跟內湖區等顧客可能只在假日特地來購物商城逛街補貨,因此單次購買量較高
4. 30~50歲平均購買單價較其他年齡層高,推測經濟能力較好
prodA = Z0 %>% group_by(prod) %>% summarise(
totalQty = sum(qty),
totalRev = sum(price),
totalGross = sum(price) - sum(cost),
grossMargin = totalGross/totalRev,
avgPrice = totalRev/totalQty
)
prodA## # A tibble: 23,789 x 6
## prod totalQty totalRev totalGross grossMargin avgPrice
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0002884011363 1 279 79 0.283 279
## 2 0008635012177 167 18474 3611 0.195 111.
## 3 0008635099680 30 2710 580 0.214 90.3
## 4 0010742201412 27 2101 400 0.190 77.8
## 5 0010742201610 30 2316 426 0.184 77.2
## 6 0010742201719 17 1331 260 0.195 78.3
## 7 0010742201818 21 1651 328 0.199 78.6
## 8 0010742202112 3 237 48 0.203 79
## 9 0010742202211 28 2172 408 0.188 77.6
## 10 0010742206615 35 2675 470 0.176 76.4
## # ... with 23,779 more rows
prod1 = Z0 %>% group_by(prod) %>% summarise(
noProd = n_distinct(prod),
totalQty = sum(qty),
totalRev = sum(price),
totalGross = sum(price) - sum(cost),
grossMargin = totalGross/totalRev,
avgPrice = totalRev/totalQty
)
prod1 <- prod1 %>% arrange(desc(totalQty,avgPrice)) %>% head(10)
ggplot(prod1, aes(x = avgPrice, y = totalQty))+geom_point(aes(col = totalGross,size = totalRev))+scale_color_gradientn(colors=c("seagreen","gold","red"))+geom_text_repel(aes(avgPrice, totalQty, label = prod))top2000 = prodA %>% top_n(2000, totalRev)
g1=ggplot(top2000, aes(x=totalRev, y=avgPrice, col=prod)) +
geom_point()
ggplotly(g1)我們發現商品平均售價不高,同時我們也可以發現最暢銷的產品不一定會賺錢,因此大豐超市需要推廣更多毛利較高的產品,才能創造更好的營收。
cats = Z0 %>% group_by(cat) %>% summarise(
noProd = n_distinct(prod),
totalQty = sum(qty),
totalRev = sum(price),
totalGross = sum(price) - sum(cost),
grossMargin = totalGross/totalRev,
avgPrice = totalRev/totalQty
)#商品探索圖
top_cat <- cats %>% arrange(desc(totalGross)) %>% head(10)
top_cat$cat <- as.factor(top_cat$cat)
ggplot(top_cat, aes(x=cat, y=avgPrice, fill=cat))+geom_col()test2 <- Z0 %>% filter(age %in% c("a34","a39","a44","a49"))
test3 <- test2 %>%
group_by(prod) %>%
summarise(prod_qty = sum(qty)) %>%
arrange(desc(prod_qty))
test4 <-merge(test2,test3)t1 <-test3 %>% head(10)
t2 <- merge(test2,t1)
t2$cat <- as.factor(t2$cat)
ggplot(t2, aes(x=age, y=qty, fill=cat)) + geom_col()# 對品類(`category`)做彙總
cattest = Z0 %>%
group_by(cat) %>% #根據每個品項每月會有一個值
summarise(
totalQty = sum(qty),
totalRev = sum(price),
totalGross = sum(price) - sum(cost),
grossMargin = totalGross/totalRev,
avgPrice = totalRev/totalQty
) %>%
arrange(cat)
cattest2 = cattest %>% as.data.frame
cattest2 = cattest2 %>% arrange(desc(totalGross)) %>% head(50)cattest2$cat <- as.factor(cattest2$cat)
a=qplot(x = avgPrice,
y = totalGross ,
data= cattest2,
color = cat,
size = totalGross)
options(scipen=999)#不要科學記號
ggplotly(a)320402為單價高但毛利最好的產品
rev)最大的100個品類與平均價格col6 = c('seagreen','gold','orange',rep('red',3))
gg2= group_by(Z0, cat) %>% summarise(
solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost),
profit = rev - cost, margin = 100*profit/rev , avg_price = rev/qty
) %>%
top_n(100, profit) %>%
ggplot(aes(x=margin, y=rev, col=profit, label=cat, label2=avg_price)) +
geom_point(size=2,alpha=0.8) + scale_y_log10() +
scale_color_gradientn(colors=col6) +
theme_bw()
ggplotly(gg2)探查此區高毛利率但營收不高的前五名品類 #501002,501001,500903,500705,560336
catA1 = subset(Z0, cat=="501001") #篩選出501001品類資料
catA2 = catA1 %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(catA2) ## [1] 1112
sapply(catA2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值## items pieces total gross
## 99.9% 4 18.2230 1788.204 464.2390
## 99.95% 4 19.4445 1889.446 533.8955
## 99.99% 4 19.8889 1897.889 586.7791
catA2 = subset(catA2, items<=4 & pieces<19.889 & total<1897.9,gross<586.78) par(cex=0.8)
hist(catA2$date, "weeks", freq=T, las=2, main="cat501001 per Week")X0$wday = format(X0$date, "%u") #cat501001購買年齡層與週間
catA3 <- merge(catA2,X0)
catA31 <- count(catA3, age, wday)
catA4<- merge(catA31,catA3)
A4=ggplot(catA4, aes(catA4$wday,catA4$age)) +
geom_tile(aes(fill = n),colour = "white")+
scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(-10,10 ))+ theme_bw()
A4## Warning: Use of `catA4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catA4$age` is discouraged. Use `age` instead.
cat501001在週六的購買情況較好
catB1 = subset(Z0, cat=="501002") #篩選出501002品類資料
catB2 = catB1 %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(catB2) ## [1] 1298
sapply(catB2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值## items pieces total gross
## 99.9% 3 6 788.5860 228.4360
## 99.95% 3 6 841.7985 252.7385
## 99.99% 3 6 893.1597 283.3477
catB2 = subset(catB2, items<=3 & pieces<6 & total<893.16,gross<283.35) par(cex=0.8)
hist(catB2$date, "weeks", freq=T, las=2, main="cat501002 per Week")X0$wday = format(X0$date, "%u") #cat501002購買年齡層與週間
catB3 <- merge(catB2,X0)
catB31 <- count(catB3, age, wday)
catB4<- merge(catB31,catB3)
B4=ggplot(catB4, aes(catB4$wday,catB4$age)) +
geom_tile(aes(fill = n),colour = "white")+
scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
B4## Warning: Use of `catB4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catB4$age` is discouraged. Use `age` instead.
cat501002在週一及週三的購買情況較好
catC1 = subset(Z0, cat=="500903") #篩選出500903品類資料
catC2 = catC1 %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(catC2) ## [1] 1024
sapply(catC2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值## items pieces total gross
## 99.9% 3.0000 8.0000 2367.713 678.297
## 99.95% 3.4885 15.8160 2942.005 880.331
## 99.99% 3.8977 22.3632 3404.401 1046.466
catC2 = subset(catC2, items<=3.8977 & pieces<=22.363 & total<3404.4,gross<1046.47) par(cex=0.8)
hist(catC2$date, "weeks", freq=T, las=2, main="cat500903 per Week")X0$wday = format(X0$date, "%u") #cat500903購買年齡層與週間
catC3 <- merge(catC2,X0)
catC31 <- count(catC3, age, wday)
catC4<- merge(catC31,catC3)
C4=ggplot(catC4, aes(catC4$wday,catC4$age)) +
geom_tile(aes(fill = n),colour = "white")+
scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
C4## Warning: Use of `catC4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catC4$age` is discouraged. Use `age` instead.
catD1 = subset(Z0, cat=="500705") #篩選出500705品類資料
catD2 = catD1 %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(catD2) ## [1] 525
sapply(catD2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值## items pieces total gross
## 99.9% 5.4760 29.520 4739.436 1365.768
## 99.95% 5.7380 34.760 5580.718 1553.884
## 99.99% 5.9476 38.952 6253.744 1704.377
catD2 = subset(catD2, items<=5.9476 & pieces<=38.95 & total<6253.7,gross<1704.4) par(cex=0.8)
hist(catD2$date, "weeks", freq=T, las=2, main="cat500705 per Week")X0$wday = format(X0$date, "%u") #cat500705購買年齡層與週間
catD3 <- merge(catD2,X0)
catD31 <- count(catD3, age, wday)
catD4<- merge(catD31,catD3)
D4=ggplot(catD4, aes(catD4$wday,catD4$age)) +
geom_tile(aes(fill = n),colour = "white")+
scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
D4## Warning: Use of `catD4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catD4$age` is discouraged. Use `age` instead.
catE1 = subset(Z0, cat=="560336") #篩選出560336品類資料
catE2 = catE1 %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(catE2) ## [1] 1138
sapply(catE2[,6:9], quantile, prob=c(.999, .9995, .9999)) #處理離群值## items pieces total gross
## 99.9% 3.0000 5.8630 580.7670 162.9450
## 99.95% 3.4315 6.8630 790.4145 201.6775
## 99.99% 3.8863 7.7726 1010.0829 240.3355
catE2 = subset(catE2, items<=3.88 & pieces<=7.77 & total<1010.08,gross<240.34) par(cex=0.8)
hist(catE2$date, "weeks", freq=T, las=2, main="cat560336 per Week")X0$wday = format(X0$date, "%u") #cat560336購買年齡層與週間
catE3 <- merge(catE2,X0)
catE31 <- count(catE3, age, wday)
catE4<- merge(catE31,catE3)
E4=ggplot(catE4, aes(catE4$wday,catE4$age)) +
geom_tile(aes(fill = n),colour = "white")+
scale_fill_gradient2(midpoint = 1, mid = "lightblue", limits = c(0,5 ))+ theme_bw()
E4## Warning: Use of `catE4$wday` is discouraged. Use `wday` instead.
## Warning: Use of `catE4$age` is discouraged. Use `age` instead.
pacman::p_load(Matrix, arules, arulesViz)Z0$tid %>% n_distinct## [1] 119422
Z0$cat %>% n_distinct## [1] 2007
p = count(Z0, cat, sort=T)pk = p$cat[1:2007]
Z = filter(Z0, cat %in% pk)
tr = as(split(Z[,"cat"], Z[,"tid"]), "transactions"); tr## Warning in asMethod(object): removing duplicated items in transactions
## transactions in sparse format with
## 119422 transactions (rows) and
## 2007 items (columns)
rules <- apriori(tr, parameter=list(supp=0.00005, conf=0.5))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.00005 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2007 item(s), 119422 transaction(s)] done [0.16s].
## sorting and recoding items ... [1741 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [2.42s].
## writing ... [584757 rule(s)] done [0.25s].
## creating S4 object ... done [0.31s].
summary(rules)## set of 584757 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8
## 55 14701 219926 269094 73683 7050 248
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 5.000 4.725 5.000 8.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.00005024 Min. :0.5000 Min. :0.00005024 Min. : 4.068
## 1st Qu.:0.00005024 1st Qu.:0.5455 1st Qu.:0.00007536 1st Qu.: 6.358
## Median :0.00005862 Median :0.6000 Median :0.00009211 Median : 9.537
## Mean :0.00006622 Mean :0.6470 Mean :0.00010729 Mean : 13.516
## 3rd Qu.:0.00006699 3rd Qu.:0.7368 3rd Qu.:0.00011723 3rd Qu.: 13.897
## Max. :0.00617139 Max. :1.0000 Max. :0.01216694 Max. :2985.550
## count
## Min. : 6.000
## 1st Qu.: 6.000
## Median : 7.000
## Mean : 7.908
## 3rd Qu.: 8.000
## Max. :737.000
##
## mining info:
## data ntransactions support confidence
## tr 119422 0.00005 0.5
我們運用購物籃分析法,尋找消費者
cat5 <- c("501002","500705","560336","501001","500903")
rx = subset(rules, subset = lift > 5 & count > 7 & rhs %in% cat5)
inspect(rx)## lhs rhs support confidence coverage
## [1] {560351} => {560336} 0.00032657299 0.5000000 0.0006531460
## [2] {560201,560327} => {560336} 0.00006698933 0.5000000 0.0001339787
## [3] {560104,560351} => {560336} 0.00006698933 0.6666667 0.0001004840
## [4] {560351,560402} => {560336} 0.00007536300 0.7500000 0.0001004840
## [5] {560201,560351} => {560336} 0.00012560500 0.5172414 0.0002428363
## [6] {560334,570311} => {560336} 0.00006698933 0.6153846 0.0001088577
## [7] {560316,570206} => {560336} 0.00007536300 0.5625000 0.0001339787
## [8] {560316,560334} => {560336} 0.00008373666 0.5000000 0.0001674733
## [9] {560316,560339} => {560336} 0.00007536300 0.5294118 0.0001423523
## [10] {560337,570206} => {560336} 0.00008373666 0.5263158 0.0001590997
## [11] {560337,570306} => {560336} 0.00006698933 0.5000000 0.0001339787
## [12] {560334,560337} => {560336} 0.00008373666 0.5000000 0.0001674733
## [13] {560330,560335} => {560336} 0.00007536300 0.6000000 0.0001256050
## [14] {560322,560334} => {560336} 0.00007536300 0.5000000 0.0001507260
## [15] {100212,501001} => {501002} 0.00006698933 0.5333333 0.0001256050
## [16] {560201,560334,570306} => {560336} 0.00006698933 0.6153846 0.0001088577
## [17] {530110,560201,560339} => {560336} 0.00006698933 0.5000000 0.0001339787
## lift count
## [1] 52.47012 39
## [2] 52.47012 8
## [3] 69.96016 8
## [4] 78.70518 9
## [5] 54.27944 15
## [6] 64.57861 8
## [7] 59.02889 9
## [8] 52.47012 10
## [9] 55.55660 9
## [10] 55.23171 10
## [11] 52.47012 8
## [12] 52.47012 10
## [13] 62.96415 9
## [14] 52.47012 9
## [15] 49.06913 8
## [16] 64.57861 8
## [17] 52.47012 8
我們希望提高毛利率高但銷售量少的商品的購買率,
因此藉由購物籃的分析法尋找商品間的關聯性,
我們發現購買#560201可以提高#560336的購買率,
而購買#501002可以提高#5601001的購買率,於是可以用在我們的行銷策略上。